import pandas as pd
import intake_cs109b_data_mma as cs109b_data
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode()
import plotly.express as px
from ipywidgets import widgets
import plotly.graph_objects as go
from plotly import figure_factory as ff
import numpy as np
import plotly.io as pio
import seaborn as sns
# pio.renderers.keys()
# pio.renderers.default = 'jupyterlab'
import matplotlib.pylab as plt
df = cs109b_data.original_data.read()
df.head()
df.groupby("PID")["Latest Budget Changes"].transform("count")
#assume data is sorted by date
df_unique = df.copy()
df_unique["num_budget_changes"] = df_unique.groupby("PID")["Latest Budget Changes"].transform("count")
df_unique["num_schedule_changes"] = df_unique.groupby("PID")["Latest Schedule Changes"].transform("count")
df_unique["Latest Budget Changes"] = df_unique["Latest Budget Changes"].fillna(0)
#some projects has latest budget changes as first row so need to calculate the original budget
df_unique = df_unique.eval("budget_original = `Latest Budget Changes` + `Budget Forecast`").eval("schedule_original= @pd.to_timedelta(`Latest Schedule Changes`, unit='days') + `Forecast Completion`").drop_duplicates("PID").drop(columns = ["Latest Budget Changes","Latest Schedule Changes"])
df_unique = df_unique.eval("budget_actual = budget_original + `Total Budget Changes`").eval("schedule_actual= schedule_original + @pd.to_timedelta(`Total Schedule Changes`, unit='days')")
df_unique
Not all of projects start with design phase
df_unique.query("`Current Phase` == 'Design'").PID.nunique()
df_unique = df_unique.eval('budget_abs_per_error = abs(budget_original - budget_actual)/budget_actual')\
.eval('budget_rel_per_error = abs(budget_original - budget_actual)/budget_original')\
.eval('budget_ratios = budget_actual/budget_original')\
.eval('days_changed_in_schedule = schedule_actual - schedule_original')
df_unique
(df_unique["Total Schedule Changes"] - df_unique["days_changed_in_schedule"].dt.days)[(df_unique["Total Schedule Changes"] - df_unique["days_changed_in_schedule"].dt.days).notnull()].sum()
df_unique.PID.nunique()
df.PID.nunique()
df.Category.nunique()
df.info()
df.describe()
df.columns
profile = ProfileReport(df, title='Pandas Profiling Report', html={'style':{'full_width':True}})
df.drop_duplicates("Project Name").Description.value_counts()
Project Name and Description is not a 1 to 1 match, some projects have more than 1 description (?)
Multiple projects can be designed at the same day (10% of projects overlap)
1 Project has project ID but no name
Reports rise steadily from 2014-2018. More than half of reports are in 2019
28 Projects have same description: "Design and construction of a new school". 3 Design and construction of an addition to an existing school. 3 Design and construction of right-of-way green infrastructure in order for the city to comply with NYS DEC consent order which states that NYC must manage 1” of rain on 10% of impervious surfaces by 2030 to reduce combined sewer overflows. The others have unique descriptions
Design start looks bimodal, one around 2000 - 2008, other from 2012 - 2020
Budget forecast varies widely. Most small but a couple of very big ones. One negative budget forecast
Some outliers need to remove to visualize budget metrics
df_dummy_unique)profile.to_widgets()
profile.to_notebook_iframe()
profile.to_file(output_file="Capital_Projects_EDA_An.html")
unique_profile = ProfileReport(df_unique, title='Unique Projects Profiling Report', html={'style':{'full_width':True}})
unique_profile.to_widgets()
unique_profile.to_notebook_iframe()